Set/check knitR option and working directory

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.0     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes
library(adegenet)
## Loading required package: ade4
## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape
## 
##    /// adegenet 2.1.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(corrplot)
## corrplot 0.84 loaded
library(RColorBrewer)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(cluster)

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes"

Generate parameter index by correcting parameters by JE2 for each experiment

source("Functions/all_functions.R")
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
JE2_GC_median_param_long <-  read.csv("Ideas_Grant_2020_analysis/Processed_data/Growth_curves/processed_parameters_GC.csv") %>%
  filter(sample_id == "JE2") %>%
  select_if(grepl("OD", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
  group_by(experiment) %>%
  summarise_if(is.numeric, median) %>%
  ungroup() %>%
  pivot_longer(ends_with("OD")) %>%
  rename(JE2_value = value)

JE2_PI_median_param_long <-  read.csv("Ideas_Grant_2020_analysis/Processed_data/PI_curves/processed_parameters_PI.csv") %>%
  filter(sample_id == "JE2") %>%
  select_if(grepl("death", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
  group_by(experiment) %>%
  summarise_if(is.numeric, median) %>%
  ungroup() %>%
  pivot_longer(ends_with("death")) %>%
  rename(JE2_value = value)

all_GC_param_JE2_corrected <-  read.csv("Ideas_Grant_2020_analysis/Processed_data/Growth_curves/processed_parameters_GC.csv") %>%
  filter(strain_group != "CONTROL") %>%
  select_if(grepl("OD", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
  pivot_longer(ends_with("OD")) %>%
  merge(., JE2_GC_median_param_long, by = c("experiment", "name"), all.x = T, all.y = F) %>%
  filter(!is.na(JE2_value)) %>%
  mutate(value = value/JE2_value) %>%
  group_by(sample_id, name) %>%
  summarise_if(is.numeric, median) %>%
  ungroup() %>%
  select(-JE2_value) %>%
  pivot_wider()

all_PI_param_JE2_corrected <-  read.csv("Ideas_Grant_2020_analysis/Processed_data/PI_curves/processed_parameters_PI.csv") %>%
  filter(strain_group != "CONTROL") %>%
  select_if(grepl("death", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
  pivot_longer(ends_with("death")) %>%
  merge(., JE2_PI_median_param_long, by = c("experiment", "name"), all.x = T, all.y = F) %>%
  filter(!is.na(JE2_value))  %>%
  mutate(value = value/JE2_value) %>%
  group_by(sample_id, name) %>%
  summarise_if(is.numeric, median) %>%
  ungroup() %>%
  select(-JE2_value) %>%
  pivot_wider()

merge corrected data

sample_meta.df <- read.csv("Ideas_Grant_2020_analysis/Raw_data/strain_metadata_corrected_mortality_with_controls.csv") 

median_sample_parameters_all.df <- merge(all_GC_param_JE2_corrected, all_PI_param_JE2_corrected, by = "sample_id") %>% 
  select(-time_of_min_OD, -time_of_min_death, -end_point_timepoint_OD, -end_point_timepoint_death, -min_OD, -min_death, -doubling_time_OD, -doubling_time_death ) %>% 
  merge(., sample_meta.df, by = "sample_id") %>%
  mutate(ST_simp = fct_lump(ST, n = 5)) %>%
  filter(!is.na(mortality)) 
  

median_param_dat.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.)))

Check each variable individually

for (var in  str_extract(colnames(median_sample_parameters_all.df), ".*OD*") %>% na.omit()) {
  t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
    geom_density(aes(fill = mortality), alpha = .5)
  print(t)
}

for (var in  str_extract(colnames(median_sample_parameters_all.df), ".*death*") %>% na.omit()) {
  t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
    geom_density(aes(fill = mortality), alpha = .5)
  print(t)
}

for (var in  str_extract(colnames(median_sample_parameters_all.df), ".*OD*") %>% na.omit()) {
  t <- ggviolin(data = median_sample_parameters_all.df,
          y = var,
          x = "mortality",
          fill = "mortality") +
  theme_bw() +
  theme(legend.position = "none")+
  stat_compare_means(ref.group ="Survived",
                     method = "wilcox.test",
                     label = "p.signif")
  print(t)
}

for (var in  str_extract(colnames(median_sample_parameters_all.df), ".*death*") %>% na.omit()) {
  t <- ggviolin(data = median_sample_parameters_all.df,
          y = var,
          x = "mortality",
          fill = "mortality") +
  theme_bw() +
  theme(legend.position = "none")+
  stat_compare_means(ref.group ="Survived",
                     method = "wilcox.test",
                     label = "p.signif")
  print(t)
}

Check vriable correlation with correlogram

M <-cor(median_param_dat.df, method = "pearson")
corrplot(M, type="upper")

M <-cor(median_param_dat.df, method = "kendall")
corrplot(M, type="upper")

M <-cor(median_param_dat.df, method = "spearman")
corrplot(M, type="upper")

Format parameter df for PCA

pca.df <- median_param_dat.df
row.names(pca.df) <- median_sample_parameters_all.df$sample_id

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$mortality,
             col.circle = median_sample_parameters_all.df$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$mortality,
             col.circle = median_sample_parameters_all.df$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

#ST
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$ST_simp,
             col.circle = median_sample_parameters_all.df$ST_simp ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$ST_simp,
             col.circle = median_sample_parameters_all.df$ST_simp ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Contributions of variables to PC1

fviz_contrib(pca1, choice = "var", axes = 1)

Contributions of variables to PC2

fviz_contrib(pca1, choice = "var", axes = 2)

Contribution of variables to PC1 to 4

fviz_contrib(pca1, choice = "var", axes = 1:4)

Rerun PCA with variable with max contrib.

pca.df <- median_param_dat.df %>%
  select(max_death, end_point_OD, AUC_death, end_point_death, max_OD, AUC_OD, max_rate_death) %>%
#  select(AUC_death, AUC_OD, max_OD, max_rate_death, max_rate_OD, doubling_time_OD) %>%
  filter(!is.na(median_sample_parameters_all.df$mortality))
#row.names(pca.df) <- median_sample_parameters_all.df$sample_id

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Compute Discriminant Analysis of Principal Components (DAPC)

pca.df <- median_param_dat.df %>%
   filter(!is.na(median_sample_parameters_all.df$mortality))
  
row.names(pca.df) <- median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$sample_id

# All vs All

dapc1 <- adegenet::dapc(x = pca.df,
                        grp = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality == "Died",
                        n.pca = 6, 
                        n.da = 2, 
                        scale = T,
                        center = T)
summary(dapc1)
## $n.dim
## [1] 1
## 
## $n.pop
## [1] 2
## 
## $assign.prop
## [1] 0.8
## 
## $assign.per.pop
##      FALSE       TRUE 
## 0.99295775 0.07894737 
## 
## $prior.grp.size
## 
## FALSE  TRUE 
##   142    38 
## 
## $post.grp.size
## 
## FALSE  TRUE 
##   176     4
scatter.dapc(dapc1)

dapc1$var.contr
##                                 LD1
## AUC_OD                 6.457533e-03
## end_point_OD           1.040002e-01
## max_OD                 4.491477e-02
## max_rate_OD            4.053661e-01
## time_of_max_OD         1.198624e-01
## time_of_max_rate_OD    3.476192e-02
## AUC_death              3.201048e-03
## end_point_death        2.978451e-06
## max_death              1.021493e-02
## max_rate_death         6.925194e-02
## time_of_max_death      8.697950e-02
## time_of_max_rate_death 1.149867e-01
loadingplot(dapc1$var.contr)

dapc0 <-data.frame(comparison = "All vs ALL", 
                   var = row.names(dapc1$var.contr), 
                   contrib = as.character(dapc1$var.contr), 
                   row.names = NULL)

reconpute PCA with only best mortality discriminating variable

pca.df <- pca.df %>%
  select(time_of_max_OD, max_rate_OD, time_of_max_rate_death )

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Try random forest classifier

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
  mutate(ID = row.names(.))

rf.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
  mutate(mortality = median_sample_parameters_all.df$mortality) %>%
  mutate(ID = median_sample_parameters_all.df$ID) %>%
  mutate(ST_simp =  median_sample_parameters_all.df$ST_simp)

# can filter or sample here
rf.df <- rf.df %>%
  filter(!is.na(mortality))

rownames(rf.df) <- rf.df$ID


set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]

pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp)
resp_rf.vec <- rf.df$mortality

model1 <- randomForest(x = pred_rf.df, y= resp_rf.vec, data = EM.rf.df, importance = TRUE, ntree = 300)

model1
## 
## Call:
##  randomForest(x = pred_rf.df, y = resp_rf.vec, ntree = 300, importance = TRUE,      data = EM.rf.df) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 15%
## Confusion matrix:
##          Died Survived class.error
## Died       13       25  0.65789474
## Survived    2      140  0.01408451
varImpPlot(model1)

# Predicting on Validation set
predValid <- predict(model1, rf.df, type = "class")
predValid <- as.data.frame(predValid) %>% 
  mutate(ID = rownames(.)) %>%
  rename(Prediction = predValid) %>%
  merge(., rf.df %>% select(ID, mortality),  by = "ID")%>%
  droplevels() %>%
  mutate(Accurate = ifelse(Prediction == mortality, T, F))

# Predict class proba
predValid_prob <- predict(model1, rf.df, type="prob")%>%
  as.data.frame(.) %>%
  mutate(ID = rownames(.)) %>%
  merge(., predValid)

# Checking classification accuracy
mean(predValid$Accurate)                    
## [1] 1
table(predValid$Prediction, predValid$mortality)
##           
##            Died Survived
##   Died       38        0
##   Survived    0      142
table(predValid$mortality, predValid$Accurate)
##           
##            TRUE
##   Died       38
##   Survived  142
# Keep n best predicted for each strain
best_pred.df <- predValid_prob %>%
  filter(Accurate == T) %>%
  mutate(Accurate_proba = pmax(Died, Survived)) %>%
  group_by(mortality) %>%
  #filter(Accurate_proba > 0.9)
  top_frac(.5, Accurate_proba)
  
count(best_pred.df, mortality)  
## # A tibble: 2 x 2
## # Groups:   mortality [2]
##   mortality     n
##   <fct>     <int>
## 1 Died         19
## 2 Survived     72

Re-compute PCA with best accurately classified

pca_meta.df3 <-  rf.df %>%
  filter(ID %in% best_pred.df$ID) 

pca.df3 <- pca_meta.df3 %>%
  select(-ID, -mortality, -ST_simp)

pca3 <- dudi.pca(df = pca.df3, scannf = F, nf = 5, scale = T)

Plot PCA

fviz_screeplot(pca3, addlabels = T)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df3$mortality,
             col.circle = pca_meta.df3$mortality,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df3$mortality,
             col.circle = pca_meta.df3$mortality,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .8,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df3$ST_simp,
             col.circle = pca_meta.df3$ST_simp,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .95,
             addEllipses = T,
             #ellipse.level = .99,
             col.ind = pca_meta.df3$ST_simp,
             col.circle = pca_meta.df3$ST_simp,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)

Try random forest unsupervised

# library(randomForest)


# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
  mutate(ID = row.names(.))

rf.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
  mutate(mortality = median_sample_parameters_all.df$mortality) %>%
  mutate(ID = median_sample_parameters_all.df$ID) %>%
  mutate(ST_simp =  median_sample_parameters_all.df$ST_simp) %>%
  mutate(sample_id = median_sample_parameters_all.df$sample_id)

# can filter or sample here
rf.df <- rf.df %>%
  filter(!is.na(mortality))

rownames(rf.df) <- rf.df$ID


set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]

pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp, -sample_id)
resp_rf.vec <- rf.df$mortality

model1 <- randomForest(x = pred_rf.df, data = EM.rf.df, importance = TRUE, ntree = 100)

model1
## 
## Call:
##  randomForest(x = pred_rf.df, ntree = 100, importance = TRUE,      data = EM.rf.df) 
##                Type of random forest: unsupervised
##                      Number of trees: 100
## No. of variables tried at each split: 3
varImpPlot(model1)

prox <- model1$proximity
pam.rf <- pam(prox, 4)
pred <- cbind(pam.rf$clustering, as.character(rf.df$sample_id), as.character(rf.df$ST_simp), as.character(rf.df$mortality))
table(pred[,1], pred[,2])
##    
##     BPH2700 BPH2701 BPH2702 BPH2703 BPH2704 BPH2705 BPH2706 BPH2707
##   1       1       0       0       0       0       0       0       1
##   2       0       1       1       1       0       0       0       0
##   3       0       0       0       0       1       1       1       0
##   4       0       0       0       0       0       0       0       0
##    
##     BPH2708 BPH2709 BPH2710 BPH2711 BPH2712 BPH2713 BPH2714 BPH2715
##   1       0       0       0       0       0       0       0       0
##   2       0       1       1       1       0       0       0       0
##   3       0       0       0       0       1       0       0       0
##   4       1       0       0       0       0       1       1       1
##    
##     BPH2716 BPH2717 BPH2718 BPH2719 BPH2720 BPH2721 BPH2722 BPH2723
##   1       1       0       0       1       0       0       0       0
##   2       0       0       0       0       0       0       1       0
##   3       0       1       0       0       0       1       0       0
##   4       0       0       1       0       1       0       0       1
##    
##     BPH2724 BPH2725 BPH2726 BPH2727 BPH2728 BPH2729 BPH2730 BPH2731
##   1       0       0       0       0       0       0       0       0
##   2       0       0       1       1       0       0       0       1
##   3       0       1       0       0       0       0       0       0
##   4       1       0       0       0       1       1       1       0
##    
##     BPH2732 BPH2733 BPH2734 BPH2735 BPH2736 BPH2737 BPH2738 BPH2739
##   1       1       1       0       0       0       0       0       0
##   2       0       0       0       0       0       0       0       1
##   3       0       0       0       0       0       0       0       0
##   4       0       0       1       1       1       1       1       0
##    
##     BPH2740 BPH2741 BPH2742 BPH2743 BPH2744 BPH2745 BPH2746 BPH2747
##   1       0       0       0       0       0       0       0       0
##   2       1       0       0       0       0       0       0       1
##   3       0       0       0       0       0       0       0       0
##   4       0       1       1       1       1       1       1       0
##    
##     BPH2748 BPH2749 BPH2751 BPH2752 BPH2753 BPH2754 BPH2755 BPH2756
##   1       0       1       0       0       1       0       0       1
##   2       1       0       0       0       0       0       0       0
##   3       0       0       0       0       0       0       0       0
##   4       0       0       1       1       0       1       1       0
##    
##     BPH2757 BPH2758 BPH2759 BPH2760 BPH2761 BPH2763 BPH2764 BPH2765
##   1       0       0       0       0       0       0       1       0
##   2       0       1       0       0       0       1       0       1
##   3       1       0       1       1       0       0       0       0
##   4       0       0       0       0       1       0       0       0
##    
##     BPH2766 BPH2767 BPH2768 BPH2769 BPH2770 BPH2771 BPH2773 BPH2774
##   1       0       0       0       0       0       0       0       0
##   2       1       0       0       1       0       0       0       1
##   3       0       0       1       0       0       0       0       0
##   4       0       1       0       0       1       1       1       0
##    
##     BPH2775 BPH2776 BPH2777 BPH2778 BPH2779 BPH2780 BPH2781 BPH2782
##   1       0       0       0       0       0       0       0       0
##   2       1       0       0       0       0       1       1       0
##   3       0       0       0       0       0       0       0       0
##   4       0       1       1       1       1       0       0       1
##    
##     BPH2783 BPH2784 BPH2785 BPH2786 BPH2787 BPH2788 BPH2789 BPH2790
##   1       0       0       0       0       1       0       0       0
##   2       1       0       0       0       0       0       0       0
##   3       0       0       1       1       0       0       0       0
##   4       0       1       0       0       0       1       1       1
##    
##     BPH2791 BPH2792 BPH2793 BPH2794 BPH2795 BPH2796 BPH2797 BPH2798
##   1       0       1       0       0       0       0       1       0
##   2       0       0       0       0       0       0       0       0
##   3       0       0       0       1       1       1       0       0
##   4       1       0       1       0       0       0       0       1
##    
##     BPH2799 BPH2800 BPH2801 BPH2802 BPH2803 BPH2804 BPH2805 BPH2806
##   1       0       0       0       0       0       0       1       0
##   2       0       0       0       0       0       1       0       0
##   3       1       0       1       1       1       0       0       1
##   4       0       1       0       0       0       0       0       0
##    
##     BPH2807 BPH2808 BPH2809 BPH2810 BPH2811 BPH2840 BPH2841 BPH2842
##   1       0       0       0       0       0       1       1       0
##   2       0       0       0       0       1       0       0       1
##   3       0       0       0       1       0       0       0       0
##   4       1       1       1       0       0       0       0       0
##    
##     BPH2843 BPH2844 BPH2845 BPH2846 BPH2847 BPH2848 BPH2849 BPH2850
##   1       0       1       1       0       0       1       1       1
##   2       1       0       0       0       1       0       0       0
##   3       0       0       0       0       0       0       0       0
##   4       0       0       0       1       0       0       0       0
##    
##     BPH2851 BPH2852 BPH2853 BPH2854 BPH2855 BPH2856 BPH2857 BPH2858
##   1       0       0       0       0       0       0       0       0
##   2       1       0       0       1       1       0       0       0
##   3       0       0       0       0       0       0       0       0
##   4       0       1       1       0       0       1       1       1
##    
##     BPH2859 BPH2860 BPH2861 BPH2862 BPH2863 BPH2864 BPH2865 BPH2866
##   1       1       1       1       1       0       0       1       0
##   2       0       0       0       0       0       0       0       1
##   3       0       0       0       0       0       0       0       0
##   4       0       0       0       0       1       1       0       0
##    
##     BPH2867 BPH2902 BPH2904 BPH2942 BPH2943 BPH2962 BPH2963 BPH3336
##   1       0       0       0       1       0       0       0       0
##   2       0       0       0       0       0       0       1       0
##   3       0       0       0       0       0       0       0       0
##   4       1       1       1       0       1       1       0       1
##    
##     BPH3337 BPH3427 BPH3455 BPH3478 BPH3483 BPH3504 BPH3506 BPH3515
##   1       0       0       0       1       0       1       0       0
##   2       1       1       0       0       1       0       0       1
##   3       0       0       0       0       0       0       0       0
##   4       0       0       1       0       0       0       1       0
##    
##     BPH3521 BPH3525 BPH3533 BPH3535 BPH3541 BPH3549 BPH3560 BPH3563
##   1       1       1       0       0       0       0       0       0
##   2       0       0       0       1       0       0       1       0
##   3       0       0       0       0       0       0       0       0
##   4       0       0       1       0       1       1       0       1
##    
##     BPH3572 BPH3594 BPH3617 BPH3627 BPH3644 BPH3646 BPH3676 BPH3680
##   1       1       0       1       1       0       0       0       0
##   2       0       0       0       0       0       0       0       0
##   3       0       0       0       0       0       0       0       0
##   4       0       1       0       0       1       1       1       1
##    
##     BPH3681 BPH3698 BPH3706 BPH3708 BPH3712 BPH3723 BPH3725 BPH3733
##   1       1       0       0       0       0       1       0       0
##   2       0       0       1       1       0       0       0       1
##   3       0       0       0       0       0       0       0       0
##   4       0       1       0       0       1       0       1       0
##    
##     BPH3734 BPH3747 BPH3752 BPH3757
##   1       0       0       0       0
##   2       1       1       0       1
##   3       0       0       0       0
##   4       0       0       1       0
table(pred[,1], pred[,3])
##    
##      - 22 239 45  5 Other
##   1  5  0   2  4  4    21
##   2  3 10  12  1  2    18
##   3  1  0   1  3  3    14
##   4 11  9  16 11  6    23
table(pred[,1], pred[,4])
##    
##     Died Survived
##   1   10       26
##   2    9       37
##   3    1       21
##   4   18       58
cluster.df <-cbind(pam.rf$clustering, as.character(rf.df$sample_id)) %>% as.data.frame()
colnames(cluster.df) <- c("cluster", "sample_id")
  
pca_meta.df4 <-  rf.df %>%
  merge(.,  cluster.df ,  by = "sample_id") 

pca.df4 <- pca_meta.df4 %>%
  select(-sample_id, -cluster, -ST_simp, -mortality, -ID)

pca4 <- dudi.pca(df = pca.df4, scannf = F, nf = 5, scale = T)



fviz_screeplot(pca4, addlabels = T)

fviz_pca_biplot(pca4,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df4$cluster,
             title = paste("" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca4,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df4$mortality,
             title = paste("" ),
             repel = T,
             pointshape = 16)